Setup

Load Libraries

library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(ggrepel)
library(curl)
library(biomaRt)
library(sqldf)
# Ensembl LD API
library(httr)
library(jsonlite)
library(xml2)  
library(gaston)
library(RCurl)
 
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR") 

# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
# library(finemapr) # devtools::install_github("variani/finemapr")
library(roxygen2) #roxygenize()  

# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")

# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] locuscomparer_1.0.0 roxygen2_6.1.1      susieR_0.6.2.0411  
##  [4] RCurl_1.95-4.11     bitops_1.0-6        gaston_1.5.4       
##  [7] RcppParallel_4.4.2  Rcpp_1.0.0          xml2_1.2.0         
## [10] jsonlite_1.6        httr_1.4.0          sqldf_0.4-11       
## [13] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [16] biomaRt_2.38.0      curl_3.3            ggrepel_0.8.0      
## [19] cowplot_0.9.4       plotly_4.8.0        ggplot2_3.1.0      
## [22] dplyr_0.7.8         data.table_1.12.0   DT_0.5.1           
## [25] readxl_1.2.0       
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.42.0       tidyr_0.8.2          bit64_0.9-7         
##  [4] viridisLite_0.3.0    assertthat_0.2.0     expm_0.999-3        
##  [7] stats4_3.5.1         blob_1.1.1           cellranger_1.1.0    
## [10] yaml_2.2.0           progress_1.2.0       pillar_1.3.1        
## [13] lattice_0.20-38      glue_1.3.0           chron_2.3-53        
## [16] digest_0.6.18        colorspace_1.4-0     htmltools_0.3.6     
## [19] Matrix_1.2-15        plyr_1.8.4           XML_3.98-1.16       
## [22] pkgconfig_2.0.2      purrr_0.3.0          scales_1.0.0        
## [25] tibble_2.0.1         IRanges_2.16.0       withr_2.1.2         
## [28] BiocGenerics_0.28.0  lazyeval_0.2.1       magrittr_1.5        
## [31] crayon_1.3.4         memoise_1.1.0        evaluate_0.12       
## [34] tools_3.5.1          prettyunits_1.0.2    hms_0.4.2           
## [37] stringr_1.3.1        S4Vectors_0.20.1     munsell_0.5.0       
## [40] AnnotationDbi_1.44.0 bindrcpp_0.2.2       compiler_3.5.1      
## [43] rlang_0.3.1          grid_3.5.1           htmlwidgets_1.3     
## [46] tcltk_3.5.1          rmarkdown_1.11       gtable_0.2.0        
## [49] DBI_1.0.0            R6_2.3.0             knitr_1.21          
## [52] bit_1.1-14           bindr_0.1.1          commonmark_1.7      
## [55] stringi_1.2.4        parallel_3.5.1       tidyselect_0.2.5    
## [58] xfun_0.4
print(paste("susieR ", packageVersion("susieR")))
## [1] "susieR  0.6.2.411"

General Functions

createDT <- function(DF, caption="", scrollY=400){
  data <- DT::datatable(DF, caption=caption,
    extensions = 'Buttons',
    options = list( dom = 'Bfrtip', 
                    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), 
                    scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,  
                      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  ) 
   return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
  print( htmltools::tagList( createDT(DF, caption, scrollY)) ) 
} 

Data Preprocessing

TSS Data

Use bioMart to get TSS positions for each gene

get_TSS_position <- function(gene){ 
  mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
  # att <- listAttributes(mart)
  # grep("transcription", att$name, value=TRUE)
  TSS <- getBM(mart=mart,
               attributes=c("hgnc_symbol","transcription_start_site", "version"), 
               filters="hgnc_symbol", values=gene)
} 

Import GWAS Summary Statistics

For each gene, get the position of top SNP (the one with the greatest effect size/Beta)

## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(-1, `P, all studies`) #`Beta, all studies` 
top_SNPs <- cbind(Coord=paste(top_SNPs$CHR,top_SNPs$BP, sep=":"), top_SNPs)
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")
# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx") 
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") 

Get Flanking SNPs

Get all genes surrounding the index SNP (default is 1Mb upstream + 1Mb downstream) Nalls et al. (2019) data: https://github.com/neurogenetics/meta5

get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){ #1000000
  topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
  
  ## Full dataset  
  filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
  # Get col names
  # strsplit( readLines(filePath, n = 2), "\t")   
  minPos <- topSNP_sub$BP - bp_distance
  maxPos <- topSNP_sub$BP + bp_distance
  
  geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
                           sql = paste('select * from file where CHR =',topSNP_sub$CHR,
                                       'AND POS BETWEEN', minPos, 'AND', maxPos)) 
  geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
  return(geneSubset)
} 
# flankingSNPs <- get_flanking_SNPs("LRRK2", top_SNPs)

susieR

Gaston LD

  • Nalls et al. 2018: Imputation Panel Notes
    • “One of the limitations of this study is the use of multiple imputation panels, due to logistic constraints. Adding datasets from non-European populations would be helpful to further improve our granularity in association testing and ability to fine-map loci through integration of more variable LD signatures.”
    • “Post-Chang 23andMe samples were imputed using a combination of Finch for phasing (an in-house developed fork of Beagle) and miniMac2 for imputation with all-ethnicity samples from the September 2013 release of 1000 Genomes Phase1 as reference haplotypes.”
    • “The Nalls et al . 2014 and Chang et al . 2017 samples were imputed with Minimac2 using 1000 Genomes phase 1 haplotypes. All additional sample series except for the post-Chang et al . 2017 samples from 23andMe were imputed using the Haplotype Reference Consortium (HRC) on the University of Michigan imputation server under default settings with Eagle v2.3 phasing based on reference panel HRC r1.1 2016”
gaston_LD <- function(gene, flankingSNPs, reference="1KG_Phase1", superpopulation="EUR"){
 
  # Download portion of vcf from 1KG website
  region <- paste(flankingSNPs$CHR[1],":",min(flankingSNPs$POS),"-",max(flankingSNPs$POS), sep="")
  chrom <- flankingSNPs$CHR[1]
  if(reference=="1KG_Phase3"){
    cat("LD Reference Panel = 1KG_Phase3 \n")
    vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr",chrom,
                     ".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",sep="") 
    
      if(!exists("popDat_1KGphase3")){
         popDat <- popDat_1KGphase3 <- read.delim("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel", header = T, row.names = NULL)
         colnames(popDat_1KGphase3) <- c("sample","population","superpopulation","gender")
      } else{popDat <- popDat_1KGphase3}
      
    
  } else if (reference=="1KG_Phase1") {
    cat("LD Reference Panel = 1KG_Phase1 \n")
    vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr",chrom,
           ".phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz", sep="")
    # Download individual-level metadata
     
      if(!exists("popDat_1KGphase1")){
         popDat <- popDat_1KGphase1  <- read.delim("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel", header = F, row.names = NULL)
         colnames(popDat) <- c("sample","population","superpopulation","seq_platform1","seq_platform2")
      } else { popDat <- popDat_1KGphase1 }
     
  
  }  
  
  # library(Rsamtools); #BiocManager::install("Rsamtools")
  
  system(paste("tabix -fh ",vcf_URL ,region, "> subset.vcf")) 
  vcf_name <- paste(basename(vcf_URL), ".tbi",sep="") 
  
  # Import w/ gaston and further subset
  bed.file <- read.vcf("subset.vcf")
  ## Subset rsIDs
  bed <- select.snps(bed.file, id %in% flankingSNPs$SNP)
  # Subset Individuals
  selectedInds <- subset(popDat, superpopulation %in% superpopulation) 
  bed <- select.inds(bed, id %in% selectedInds$sample)
  
  # Cleanup extra files
  rm(bed.file)
  file.remove(vcf_name)
  file.remove("subset.vcf")
  
  # Calculate pairwise LD for all SNP combinations
  ld.x <- gaston::LD(bed, lim = c(1,ncol(bed)))  
  # LD plot
  limit <- 20 
  LD.plot( ld.x[1:limit,1:limit], snp.positions = bed@snps$pos[1:limit])
  
  # Double check subsetting
  LD_matrix <- ld.x[row.names(ld.x) %in% flankingSNPs$SNP, colnames(ld.x) %in% flankingSNPs$SNP] 
  return(LD_matrix)
}
# LD_matrix <- gaston_LD("LRRK2",flankingSNPs) 

susieR Function

  • Notes on L parameter
    • L is the expected number of causal variants
    • Increasing L increases computational time
    • L=1: Gives a good amount of variation in PIP.
    • L=2: Warns “IBSS algorithm did not converge in 100 iterations!”, but gives good variation in PIP.
    • L=3: Warns “IBSS algorithm did not converge in 100 iterations!”. All PIPs 1s and 0s.
    • These results seem to be at least partially dependent on whether the ethnic composition of the LD matrix.
  • Notes on variance:
    • If ‘estimate_residual_variance’ = TRUE without providing ‘var_y’ and L>1, susieR will throw error: “Estimating residual variance failed: the estimated value is negative”
    • Running susieR with ‘var_y = var(b)’ provides exactly the same results.
  • Other fine-mapping tools included in susieR
    • susieR::N3finemapping.CAVIAR()
    • susieR::N3finemapping.DAP()
    • susieR::N3finemapping.FINEMAP()
  • Statistical Terms:
    • posterior inclusion probability (PIP)
    • coefficient estimate (Beta)
    • Effect allele frequency (EAF)
    • The I^2 statistic describes the percentage of variation across studies that seems not to be due to chance.
susie_on_gene <- function(gene, top_SNPs, reference="1KG_Phase1"){
  flankingSNPs <- get_flanking_SNPs(gene, top_SNPs)  
  ### Get LD matrix 
  LD_matrix <- gaston_LD(gene, flankingSNPs, reference) 
  ## Turn LD matrix into positive semi-definite matrix
  # LD_matrix2 <- ifelse(matrixcalc::is.positive.semi.definite(LD_matrix), 
  #        LDmatrix,
  #        Matrix::nearPD(LD_matrix)$mat %>% as.matrix() )
  
  ## Subset summary stats to only include SNPs found in query 
  geneSubset <- subset(flankingSNPs, SNP %in% row.names(LD_matrix)) 
  b <- geneSubset$Effect
  se <- geneSubset$StdErr 
  
  # Run Susie 
  fitted_bhat <- susie_bhat(bhat = b, shat = se,
                            R = LD_matrix, 
                            n = nrow(LD_matrix), 
                           
                            L = 1, # 1
                            scaled_prior_variance = 0.1, 
                            estimate_residual_variance = TRUE, # TRUE
                            estimate_prior_variance = FALSE, # FALSE
                            verbose = T,
                            
                            # var_y = var(b),
                            standardize = TRUE
                            ) 
  # Format results
  geneSubset$Coord <- paste(geneSubset$CHR, geneSubset$POS, sep=":")
  susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
  base::merge(subset(geneSubset, select=c("SNP","Effect","P.value", "POS","Coord")), by="SNP") %>% 
    mutate(POS=as.numeric(POS))
  return(susieDF)
}  
 

# susieDF <- susie_on_gene("LRRK2", top_SNPs)
 
# write.csv(susieDF_save, "susieR_results_no.varY.csv", row.names = F, quote = F)
# write.csv(susieDF, "susieR_results_varY.csv", row.names = F, quote = F)

Plot susieR Results

before_after_plots <- function(gene, susieDF, topVariants=3){ 
  roundBreaks <- seq(plyr::round_any(min(susieDF$POS),10000), max(susieDF$POS),500000) 
  ## Before fine-mapping
  geneSubset <- susieDF %>% arrange(desc(abs(Effect)))
  labelSNPs_Effect <- geneSubset[1:topVariants,]
  yLimits <- c(min(-log(susieDF$Effect)), max(-log(susieDF$Effect))+1)
  
  before_plot <- ggplot(geneSubset, aes(x=POS, y=Effect, label=SNP, color= -log(P.value) )) + 
    ylim(yLimits) +
    geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
    geom_point() + 
    geom_point(data=labelSNPs_Effect[1,], pch=21, fill=NA, size=4, colour="red", stroke=1) + 
    geom_segment(aes(xend=POS, yend=0, color= -log(P.value)), alpha=.5) +
    geom_text_repel(data=labelSNPs_Effect, aes(label=SNP), segment.alpha = .2, nudge_y = .05, nudge_x = .5) + 
    labs(title=paste(gene," (",length(susieDF$PIP)," variants)","\nBefore Fine Mapping",sep=""), y="Beta", x="Position",
         color="GWAS\n-log(p-value)") +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_x_continuous(breaks = roundBreaks) 
  
  ## After fine-mapping 
  susieDF <- susieDF %>% arrange(desc(PIP))
  labelSNPs_PIP <- susieDF[1:topVariants,]
  yLimits <- c(min(susieDF$PIP), max(susieDF$PIP)+.1)
  
  after_plot <- ggplot(susieDF, aes(x=POS, y=PIP, label=SNP, color= -log(P.value) )) +  
    ylim(yLimits) +
    geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
    geom_point() +  
    geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),], 
               pch=21, fill=NA, size=4, colour="green", stroke=1) +
    geom_segment(aes(xend=POS, yend=yLimits[1], color= -log(P.value))) +
    geom_text_repel(data=labelSNPs_PIP, aes(label=SNP), segment.alpha = .2, nudge_y = .05, nudge_x = .5) + 
    labs(title=paste(gene," (",length(susieDF$PIP)," variants)","\nAfter Fine Mapping",sep=""), y="PIP", x="Position",
         color="GWAS\n-log(p-value)") +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_x_continuous(breaks = roundBreaks)  
  
  plot_grid(before_plot, after_plot, nrow = 2) %>% print()  
  # susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T)
  
  createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200) 
}
# before_after_plots(susieDF)

Report SNP Overlap

Nalls_vs_SusieR_overlap <- function(Nalls_SS_sig, susieDF, max_SNPs=10){
    # Get top SNPs from Nalls et all fine mapping
    Nalls_dat  <- subset(Nalls_SS_sig, `Nearest Gene`==gene) %>% 
    arrange(`P, all studies`) %>%
    mutate(SNP=as.character(SNP))
    Nalls_SNPs <-if(max_SNPs > length(Nalls_dat$SNP)){ Nalls_dat$SNP}else{Nalls_dat$SNP[1:max_SNPs]}
  # Get top SNPs from susieR fine mapping
  susieR_dat <- subset(susieDF, PIP!=0) %>% 
    arrange(desc(PIP)) %>% 
    mutate(SNP=as.character(SNP))
  susieR_SNPs <-if(max_SNPs > length(susieR_dat$SNP)){ susieR_dat$SNP}else{susieR_dat$SNP[1:max_SNPs]}
  # Calculate percent
  overlap <-  length(intersect(Nalls_SNPs, susieR_SNPs))
  percentOverlap <- overlap / length(susieR_SNPs) * 100 
  cat("\n",overlap,"/",length(susieR_SNPs), " (",round(percentOverlap,2),
      "%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping. ","\n",sep="")  
  }
# Nalls_vs_SusieR_overlap(Nalls_SS_sig, susieDF, max_SNPs=10)

Fine Map with Summary Statistics

geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`) 
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)

fineMapped_topSNPs <- data.table()
for (gene in geneList){ 
  cat("\n")
  cat("### ", gene, "\n")
  susieDF <- susie_on_gene(gene, top_SNPs, reference="1KG_Phase1")
  before_after_plots(gene, susieDF)
  Nalls_vs_SusieR_overlap(Nalls_SS_sig, susieDF, max_SNPs=10)
  
  # Create summary table for all genes
  newEntry <- cbind(data.table(Gene=gene), subset(susieDF, PIP==max(PIP)) %>% as.data.table())
  fineMapped_topSNPs <- rbind(fineMapped_topSNPs, newEntry)
  cat("\n")
}

LRRK2

LD Reference Panel = 1KG_Phase1 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-8132.45664962333” [1] “objective:-8132.43450380823” [1] “objective:-8132.43443487001” [1] “objective:-8132.4344346551”

## Warning in log(susieDF$Effect): NaNs produced

## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).

0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.

GBAP1

LD Reference Panel = 1KG_Phase1 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-4091.88469740352” [1] “objective:-4091.68730542752” [1] “objective:-4091.68707588275” [1] “objective:-4091.68707561522”

## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).

0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.

SNCA

LD Reference Panel = 1KG_Phase1 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-7013.97692378093” [1] “objective:-7012.50353136503” [1] “objective:-7012.50298538025” [1] “objective:-7012.50298518088”

## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced

0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.

VPS13C

LD Reference Panel = 1KG_Phase1 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-8099.36910165787” [1] “objective:-8099.34881477087” [1] “objective:-8099.34879629291” [1] “objective:-8099.34879627613”

## Warning in log(susieDF$Effect): NaNs produced

## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).

0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.

GCH1

LD Reference Panel = 1KG_Phase1 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-6952.39812371061” [1] “objective:-6952.38232172599” [1] “objective:-6952.38227859896” [1] “objective:-6952.38227848141”

## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).

0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.

Top SNPs

createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html